Getting names data

This data comes from kaggle.com, which has collected all recorded names for children born in the United States between the years 1880-2014.

## Load the national baby-names table.
## The data was originally read from the Kaggle CSV with the call below:
## names <- read.csv('Baby-Name-Project/data/raw_data/NationalNames.csv')
## It was then saved as an RDS file and the CSV removed to save space,
## so the RDS copy is what gets read here.
## NOTE(review): the object is called `names`, the same as base R's names()
## function; function calls still resolve, but the name is easy to confuse.
names <- readRDS('Baby-Name-Project/data/raw_data/all-names.rds')

## Load the baby-names table broken down by state.
## Originally read from the Kaggle CSV with the call below:
## state <- read.csv('Baby-Name-Project/data/raw_data/StateNames.csv')
## As above, it was saved as an RDS file and the CSV removed.
state <- readRDS('Baby-Name-Project/data/raw_data/state-names.rds')

Top 10 boys names

## Sum the counts for every male name across all years and order the
## result from most to least common.
## (group_by() already carries Name and Count through to summarise(), so no
## intermediate select() is needed.)
descending <- names %>%
  filter(Gender == 'M') %>%
  group_by(Name) %>%
  summarise(total = sum(Count)) %>%
  arrange(desc(total))

## Keep the ten most common names.
## head() returns at most 10 rows, whereas descending[1:10, ] would pad the
## result with NA rows if fewer than 10 names were present.
top10 <- head(descending, 10)

## Render the table as HTML.
## Both styling options are passed through bootstrap_options: given as two
## positional arguments, 'bordered' would land in latex_options and have no
## effect on HTML output.
kable(top10, format = 'html', digits = 0,
      caption = 'Top 10 Names for Baby Boys, US 1880-2014') %>%
  kableExtra::kable_styling(bootstrap_options = c('striped', 'bordered')) %>%
  kableExtra::footnote(general = 'Kaggle.com', general_title = 'Source: ',
                       footnote_as_chunk = TRUE)
Top 10 Names for Baby Boys, US 1880-2014
Name total
James 5105919
John 5084943
Robert 4796695
Michael 4309198
William 4055473
David 3577704
Joseph 2570095
Richard 2555330
Charles 2364332
Thomas 2283080
Source: Kaggle.com

Limiting to boys named Nathan and Nathan-related names

## Restrict the national data to boys whose name is 'Nathan' or one of its
## variants ('Nate', 'Nathanial', 'Nathaniel', 'Nathanael').
## dplyr's filter() keeps only the rows for which the condition is TRUE.
dnn <- names %>%
  filter(Gender == 'M') %>%
  filter(Name %in% c('Nathan', 'Nate', 'Nathanial', 'Nathaniel', 'Nathanael'))

## Keep only the male 'Nathan' rows from the state-level data.
state_dnn <- state %>%
  filter(Gender == 'M', Name == 'Nathan')

## Total number of Nathans recorded in each state (columns: state, total).
state_dnn_sum <- state_dnn %>%
  group_by(state = State) %>%
  summarize(total = sum(Count))

## Total number of recorded babies in each state (columns: state, total).
## Note: no Gender filter is applied here, so this denominator counts every
## row in `state`, not only boys.
sum_names_state <- state %>%
  group_by(state = State) %>%
  summarize(total = sum(Count))

## Share of Nathans in each state, rescaled so the highest state equals 1.
## The two summaries are matched by joining on `state` rather than dividing
## the columns position by position: a state with no 'Nathan' rows would
## otherwise shift or recycle the numerator against the wrong state.
## States missing from state_dnn_sum get a count of 0.
prop_N <- sum_names_state %>%
  left_join(rename(state_dnn_sum, named = total), by = 'state') %>%
  mutate(share = replace(named, is.na(named), 0) / total) %>%
  transmute(state, prop = share / max(share))

## Keep only the male 'Nathaniel' rows from the state-level data.
state_dnnat <- state %>%
  filter(Gender == 'M', Name == 'Nathaniel')

## Total number of Nathaniels recorded in each state (columns: state, total).
state_dnnat_sum <- state_dnnat %>%
  group_by(state = State) %>%
  summarize(total = sum(Count))

## Share of Nathaniels in each state, rescaled so the highest state equals 1.
## sum_names_state (per-state totals, columns: state, total) is matched to the
## Nathaniel counts by joining on `state` rather than dividing the columns
## position by position: a state with no 'Nathaniel' rows would otherwise
## shift or recycle the numerator against the wrong state.
## States missing from state_dnnat_sum get a count of 0.
prop_Nat <- sum_names_state %>%
  left_join(rename(state_dnnat_sum, named = total), by = 'state') %>%
  mutate(share = replace(named, is.na(named), 0) / total) %>%
  transmute(state, prop = share / max(share))

Creating interactive plot

## Static line chart of yearly counts for each Nathan-related name, with the
## years 1989-1991 highlighted by a shaded band and a '1990' text label.
p1 <- ggplot(dnn, aes(x = Year, y = Count, color = Name)) +
  ## layers, drawn in this order: lines, shaded band, label
  geom_line(size = 1) +
  annotate('rect', xmin = 1989, xmax = 1991, ymin = 0, ymax = Inf,
           fill = 'cadetblue3', alpha = 0.6) +
  annotate('text', label = '1990', x = 1988, y = 12500, size = 5,
           hjust = 'right') +
  ## x axis ticks every 10 years starting at 1880
  scale_x_continuous(breaks = seq(1880, 2014, by = 10)) +
  ## titles and axis label
  labs(
    title = 'Baby boys with Nathan-related names',
    subtitle = 'United States, 1880-2014',
    caption = 'Source: www.kaggle.com',
    y = 'Number of babies'
  ) +
  ## text sizes, margins, and a boxed legend placed inside the plot area
  theme(
    plot.title = element_text(face = 'bold', size = 16),
    plot.subtitle = element_text(size = 13),
    plot.margin = unit(rep(1, 4), 'lines'),
    axis.text = element_text(size = 10, color = 'black'),
    axis.title.x = element_text(size = 12, face = 'bold', margin = margin(t = 10)),
    axis.title.y = element_text(size = 12, face = 'bold', margin = margin(r = 10)),
    legend.position = c(0.15, 0.7),
    legend.title = element_text(size = 12, face = 'bold'),
    legend.text = element_text(size = 12),
    legend.background = element_rect(fill = 'white', size = 0.5,
                                     linetype = 'solid', color = 'black')
  )

p1

Interactive Plot

## Base plot for the interactive version: one line plus points per name.
p2 <- ggplot(dnn, aes(x = Year, y = Count, color = Name)) +
  geom_line() +
  geom_point() +
  labs(y = 'Number of babies') +
  theme(plot.margin = unit(rep(1, 4), 'lines'))

## ggplotly() converts the ggplot object into an interactive plot.
ggplotly(p2)
## Hovering the mouse over an individual point should show its count and year.
## To keep the graph from being displayed inside the R Markdown file:
##   - click the gear icon next to 'Knit' at the top
##   - select 'Chunk Output in Console'

Mapping total number of Nathans by state

Making a heat map of the proportion of Nathans born in each state

## US map with each state filled by the `prop` column of prop_N,
## on a blue (low) to red (high) gradient.
p3 <- plot_usmap(data = prop_N, values = 'prop') +
  labs(
    title = 'Proportion of Babies Named Nathan By State',
    subtitle = 'US babies born 1880-2014',
    caption = 'Source: Kaggle.com'
  ) +
  scale_fill_gradient(name = 'Proportion', low = 'blue', high = 'red') +
  ## legend on the right; explicit text sizes for titles, caption and legend
  theme(
    plot.title = element_text(size = 16, face = 'bold'),
    plot.subtitle = element_text(size = 13),
    plot.caption = element_text(size = 9),
    legend.position = 'right',
    legend.title = element_text(size = 11, face = 'bold'),
    legend.text = element_text(size = 9)
  )

p3

Mapping total Nathaniels by State

## US map with each state filled by the `prop` column of prop_Nat,
## on a blue (low) to red (high) gradient.
p4 <- plot_usmap(data = prop_Nat, values = 'prop') +
  labs(
    title = 'Proportion of Babies Named Nathaniel By State',
    subtitle = 'US babies born 1880-2014',
    caption = 'Source: Kaggle.com'
  ) +
  scale_fill_gradient(name = 'Proportion', low = 'blue', high = 'red') +
  ## legend on the right; explicit text sizes for titles, caption and legend
  theme(
    plot.title = element_text(size = 16, face = 'bold'),
    plot.subtitle = element_text(size = 13),
    plot.caption = element_text(size = 9),
    legend.position = 'right',
    legend.title = element_text(size = 11, face = 'bold'),
    legend.text = element_text(size = 9)
  )

p4